library(data.table)
library(magrittr)
## Warning: package 'magrittr' was built under R version 4.0.5
library(kableExtra)
library(ggplot2); theme_set(theme_bw(base_size = 16))
library(patchwork)
Single-cell samples of human PBMCs that were aligned with CellRanger.
There are 3 donors and 3 treatments for each donor, i.e. 9 samples in total:
We are going to read in the .h5 files generated by
CellRanger.
Details about h5 format are here
I used Seurat’s function for reading the H5 data (see
readingH5.R).
options(menu.graphics=FALSE)
#library(SingleCellExperiment);library(Matrix)
data_dir <- "2022-04_Dennis/data/"
source("readingH5.R")
smpls <- c("029-1","029-3","029-4","5011-1","5011-3","5011-4","5334-1","5334-3","5334-4")
smptab <- data.frame(filename=smpls,
sample=c(
paste("B029",c("DMSO","IL15","IL15HOD"), sep="_"),
paste("OM5011",c("DMSO","IL15","IL15HOD"), sep="_"),
paste("OM5334",c("DMSO","IL15","IL15HOD"), sep="_"))
)
scel <- lapply(smpls, function(x){
mysmp <- subset(smptab, filename == x)$sample
print(paste("Reading CellRanger output for", x))
retsce <- Read10X_h5_2SCE(paste0(data_dir, "JonesLab_IL15J_10Xdata/", x, "/filtered_feature_bc_matrix.h5"), mysmp)
return(retsce)
})
names(scel) <- smptab$sample
full_data <- do.call(cbind, lapply(scel, counts))
cell_info <- do.call(rbind, lapply(scel, colData))
gene_info <- rowData(scel[[1]])
## combine in one object
sce.all <- SingleCellExperiment(
list(counts = full_data),
rowData = gene_info,
colData = cell_info,
metadata = list(Samples = names(scel))
)
rm(scel); gc()
## remove completely uncovered genes
gnszero <- Matrix::rowSums(counts(sce.all)) == 0
sce.all <- sce.all[!gnszero, ]
#> dim(sce.all)
# [1] 22374 46086
## add QC info
is.mito <- grepl("mt-", ignore.case = TRUE, rowData(sce.all)$Symbol)
sce.all <- scuttle::addPerCellQC(sce.all, subsets=list(mitochondrial=is.mito))
mm <- lapply(unique(sce.all$Sample), function(x){
ss <- sce.all[, sce.all$Sample == x]$subsets_mitochondrial_percent
scuttle::isOutlier(ss, type = "higher")})
sce.all$mito.discard <- unlist(mm)
sce.all$mito.discard <- ifelse(is.na(sce.all$mito.discard), FALSE, sce.all$mito.discard)
## determine GENE count thresholds for each Sample individually
gg <- lapply(unique(sce.all$Sample), function(x){
ss <- log10(sce.all[, sce.all$Sample == x]$detected)
scuttle::isOutlier(ss)
})
sce.all$gene.discard <- unlist(gg)
## save colData
library(data.table);library(magrittr)
cd <-colData(sce.all)
cd$cell <- rownames(cd)
cd <- as.data.frame(cd) %>% as.data.table
saveRDS(cd, file = paste0("colData_unfiltered_", Sys.Date(), ".rds"))
saveRDS(sce.all, file = paste0("sce_unfiltered_", Sys.Date(), ".rds"))
I typically do iterative rounds of filtering, starting with scuttle’s outlier detection and double-checking the distributions and comparing them across samples. This is necessary because different sample types will have different types of QC characteristics, e.g. blood cells are usually fairly small and contain little RNA, which is very different when dealing, for example, with hepatocytes or other metabolically heavily active cells.
## load the colData locally for Rmd
qcall <- readRDS(paste0(data_dir, "colData_unfiltered_2022-05-03.rds"))
qcall$condition <- gsub("_.*","", qcall$Sample)
qcall$replicate <-gsub("M", "", gsub(".*_","",qcall$Sample))
for(i in unique(qcall$condition)){
dt <- qcall[condition == i]
P1 <- ggplot(dt,
aes(x = Sample, y = subsets_mitochondrial_percent, color = mito.discard)) +
ggbeeswarm::geom_quasirandom(size = .2, alpha = .3, shape = 1) +
theme(axis.text.x = element_text(angle=90, vjust=0.5, size=14)) +
scale_color_manual(values = rev(c("firebrick2","grey75"))) +
ylab("% mitochondrial genes") + coord_cartesian(ylim = c(0,20)) +
theme(legend.position = "bottom")+
guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
P2 <- ggplot(dt,
aes(x = subsets_mitochondrial_percent, y = log10(detected),
color = mito.discard)) +
geom_point(size = .2, shape = 1 , alpha = .5) +
facet_wrap(~Sample, ncol = 2) +
scale_color_manual(values = rev(c("firebrick2","grey75"))) +
xlab("% mitochondrial genes") + ylab("log10(total number of genes)")+
theme(legend.position = "bottom")+
guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
pw <- P1 | P2
pw <- pw + plot_annotation(title = i) + plot_layout(widths = c(1.5,3))
print(pw)
}
Let’s check which cells are indicated to be removed based on the gene content. Here, we show only those cells that survive the mito-filter above:
for(i in unique(qcall$condition)){
dt <- qcall[condition == i & mito.discard != TRUE]
ymax <- max(dt$subsets_mitochondrial_percent)
P1 <- ggplot(dt,
aes(x = Sample, y = subsets_mitochondrial_percent, color = gene.discard)) +
ggbeeswarm::geom_quasirandom(size = .2, alpha = .3, shape = 1) +
theme(axis.text.x = element_text(angle=90, vjust=0.5, size=14)) +
scale_color_manual(values = rev(c("blue","grey75"))) +
ylab("% mitochondrial genes") + coord_cartesian(ylim = c(0,ymax)) +
theme(legend.position = "bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
P2 <- ggplot(dt,
aes(x = subsets_mitochondrial_percent, y = log10(detected),
color = gene.discard)) +
geom_point(size = .2, shape = 1 , alpha = .5) +
facet_wrap(~Sample, ncol = 2) +
scale_color_manual(values = rev(c("blue","grey75"))) +
xlab("% mitochondrial genes") + ylab("log10(total number of genes)")+
theme(legend.position = "bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
pw <- P1 | P2
pw <- pw + plot_annotation(title = i, subtitle = "Cells that will be discarded due to the number of genes") + plot_layout(widths = c(1.5,3))
print(pw)
}
I think, I will disregard scuttle’s gene.discard filter, and just remove those with very few genes:
Will go with scuttle’s outlier detection for the mito content and with the above mentioned settings for the gene content.
# with scuttle's outlier detection
table(qcall$Sample, qcall$gene.discard)
FALSE TRUE
B029_DMSO 4449 374
B029_IL15 3394 145
B029_IL15HOD 4992 177
OM5011_DMSO 3191 389
OM5011_IL15 4358 236
OM5011_IL15HOD 2760 202
OM5334_DMSO 2850 335
OM5334_IL15 5490 392
OM5334_IL15HOD 12083 269
qcall[, min_genes := ifelse(Sample %in% c("OM5011_DMSO", "OM5011_IL15","OM5011_IL15HOD", "OM5334_DMSO","OM5334_IL15"), 2.75,
ifelse(grepl("^B029", qcall$Sample), 2.6, 2.5))]
qcall[ , gene.discard := ifelse(log10(detected) <= min_genes, TRUE, FALSE)]
qcall[, rm_cell := ifelse(mito.discard == TRUE | gene.discard == TRUE, TRUE, FALSE)]
> table(qcall$Sample, qcall$gene.discard)
FALSE TRUE
B029_DMSO 4752 71
B029_IL15 3502 37
B029_IL15HOD 5130 39
OM5011_DMSO 3391 189
OM5011_IL15 4404 190
OM5011_IL15HOD 2791 171
OM5334_DMSO 3060 125
OM5334_IL15 5586 296
OM5334_IL15HOD 12320 32
How many cells are going to be removed per sample?
table(qcall$rm_cell, qcall$Sample)
##
## B029_DMSO B029_IL15 B029_IL15HOD OM5011_DMSO OM5011_IL15 OM5011_IL15HOD
## FALSE 4601 3403 5000 3359 4343 2725
## TRUE 222 136 169 221 251 237
##
## OM5334_DMSO OM5334_IL15 OM5334_IL15HOD
## FALSE 2966 5428 12074
## TRUE 219 454 278
ggplot(qcall, aes(x = Sample, fill = rm_cell)) +geom_bar() + coord_flip() +
# scale_color_manual(values = c("midnightblue","dodgerblue3","dodgerblue1")) +
scale_fill_manual(values = c("grey75","orange")) +
ylab("# cells") + ggtitle("How many cells will be removed per sample?")
for(i in unique(qcall$condition)){
dt <- qcall[condition == i & mito.discard!=TRUE]
ymax <- max(dt$subsets_mitochondrial_percent)
P1 <- ggplot(dt,
aes(x = Sample, y = subsets_mitochondrial_percent, color = rm_cell)) +
ggbeeswarm::geom_quasirandom(size = .2, alpha = .3, shape = 1) +
theme(axis.text.x = element_text(angle=90, vjust=0.5, size=14)) +
scale_color_manual(values = rev(c("orange","grey75"))) +
ylab("% mitochondrial genes") + coord_cartesian(ylim = c(0,ymax)) +
theme(legend.position = "bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
P2 <- ggplot(dt,
aes(x = subsets_mitochondrial_percent, y = log10(detected),
color = rm_cell)) +
geom_point(size = .2, shape = 1 , alpha = .5) +
facet_wrap(~Sample, ncol = 2) +
scale_color_manual(values = rev(c("orange","grey75"))) +
xlab("% mitochondrial genes") + ylab("log10(total number of genes)")+
theme(legend.position = "bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1, size = 3, shape = 19)))
pw <- P1 | P2
pw <- pw + plot_annotation(title = i) + plot_layout(widths = c(1.5,3))
print(pw)
}
# save the colData to be added back to the SCE
saveRDS(qcall, file = paste0(data_dir, "colData_filtered_2022-05-03.rds"))
#$ scp colData_filtered_2022-05-03.rds frd2007@redteam2.pbtech:/scratchLocal/frd2007/2022-04_Dennis/2022-04-26_ReadingDataIn/
## on server
#sce.all = readRDS("sce_unfiltered_2022-05-03.rds")
cd <- colData(sce.all)
cd$cell = row.names(cd)
cd <- as.data.frame(cd) %>% as.data.table
qcall <- readRDS("colData_filtered_2022-05-03.rds")
qcall <- qcall[, c("Sample","Barcodes","cell","rm_cell", "min_genes")] %>% .[cd, on = c("Sample", "Barcodes","cell")]
qcall.df <- DataFrame(as.data.frame(qcall))
rownames(qcall.df) <- qcall.df$cell
colData(sce.all) <- qcall.df[colnames(sce.all),]
## remove cells
sce.all <- sce.all[, !sce.all$rm_cell]
gnszero <- Matrix::rowSums(counts(sce.all)) == 0
sce.all <- sce.all[!gnszero, ]
#> dim(sce.all)
#[1] 22266 43899
saveRDS(sce.all, file = "sce_filtered_2022-05-03.rds")
library(Matrix);library(scran);library(BiocParallel)
#library(SingleCellExperiment);
#sce = readRDS("sce_filtered_2022-03-25.rds")
hs.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran"))
cc.noNormAllCells <- list()
samples = unique(sce.all$Sample)
for(i in samples){
set.seed(123)
message(i)
p = bpstart(MulticoreParam(12))
sc.tmp <- sce.all[, sce.all$Sample == i]
cc.noNorm <- scran::cyclone(sc.tmp, pairs=hs.pairs, BPPARAM=p)
names(cc.noNorm$phases) <- colnames(sc.tmp)
row.names(cc.noNorm$scores) <- colnames(sc.tmp)
row.names(cc.noNorm$normalized.scores) <- colnames(sc.tmp)
cc.noNormAllCells[[i]] <- cc.noNorm
rm(sc.tmp)
rm(cc.noNorm)
gc()
}
phs <- lapply(cc.noNormAllCells, function(x) x$phases) %>% unlist
names(phs) <- sub(".*?\\.","", names(phs))
colData(sce.all)$cc_phase <- phs[colnames(sce.all)]
g1phs <- lapply(cc.noNormAllCells, function(x) x$score$G1) %>% unlist
#names(g1phs) <- sub(".*?\\.","", names(g1phs))
colData(sce.all)$G1score <- g1phs
g2mphs <- lapply(cc.noNormAllCells, function(x) x$score$G2M) %>% unlist
colData(sce.all)$G2Mscore <- g2mphs
##!saveRDS(cc.noNormAllCells, file="cc.noNormAllCells.rds")
##!saveRDS(sce.all, file="sce_filtered_2022-05-03.rds")
#rownames(sce.all) <- scater::uniquifyFeatureNames(rowData(sce.all)$ID, #rowData(sce.all)$Symbol)
source("/scratchLocal/frd2007/2022-04_Dennis/src/Mnn.R")
library(SingleCellExperiment); library(scater);library(scran);library(batchelor);library(magrittr)
## load previously filtered sample
scf <- readRDS("sce_filtered_2022-05-03.rds")
#scf <- sce.all
#rm(sce.all);gc()
## getting a list ----------------------
scel <- lapply(unique(scf$Sample), function(x){
individualize_samples(scf[, scf$Sample == x], pf = "allInteg",
shared_colData = names(colData(scf)))})
names(scel) <- unique(scf$Sample)
## MNN-correction, UMAPing, clustering --> see Mnn.R for details----------------
## First, do integration with all genes, including ery-genes:
scf.mnn <- MNN_correct(scel, hvg_n = 2500,
shared_colData = c("Sample", "Barcodes","cell"),
fn="Dennis_sampleIntegration_allGenes_2022-05-03",
cluster_ks = c(50,100,150,200,250,300),
save_hvg = TRUE, ignore_genes=NULL)
rownames(scf.mnn) <- scater::uniquifyFeatureNames(rowData(scf.mnn)$ID, rowData(scf.mnn)$Symbol)
saveRDS(scf.mnn, file = "sce_Dennis_sampleIntegration_2022-05-03.rds")
scf.mnn.nc <- scf.mnn
counts(scf.mnn.nc) <- NULL
saveRDS(scf.mnn.nc, file = "sce_Dennis_sampleIntegration_2022-05-03_noCounts.rds")
TENxPBMCDatalibrary(celldex)
library(SingleR)
scf.mnn <-readRDS(file = "sce_Dennis_sampleIntegration_2022-05-03.rds")
rownames(scf.mnn) <- rowData(scf.mnn)$ID
hpca <- celldex::HumanPrimaryCellAtlasData(ensembl=TRUE)
## Performing predictions
predictedLabels <- SingleR(
test=scf.mnn, assay.type.test="logcounts",
ref=hpca,
labels=hpca$label.main)
##!save(predictedLabels, file = "SingleR_HPCALabels.rda")
scf.mnn$labPredict_HPCA <- predictedLabels$pruned.labels
scf.mnn$labPredict_HPCA_prePrune <- predictedLabels$labels
rownames(scf.mnn) <- scater::uniquifyFeatureNames(ID=rowData(scf.mnn)$ID, names = rowData(scf.mnn)$Symbol)
saveRDS(scf.mnn, file = "sce_Dennis_sampleIntegration_2022-05-03.rds")
scf.mnn.nc <- scf.mnn
counts(scf.mnn.nc) <- NULL
saveRDS(scf.mnn.nc, file = "sce_Dennis_sampleIntegration_2022-05-03_noCounts.rds")
## labels assigned to our data
> table(scf.mnn$labPredict_HPCA)
B_cell CMP DC GMP
1658 49 598 10
HSC_-G-CSF Macrophage MEP Monocyte
1 273 10 2879
Neutrophils NK_cell Pre-B_cell_CD34- Pro-B_cell_CD34+
222 7935 36 4
T_cells
30091
## labels present in the reference data set
> table(hpca$label.main)
Astrocyte B_cell BM
2 26 7
BM & Prog. Chondrocytes CMP
1 8 2
DC Embryonic_stem_cells Endothelial_cells
88 17 64
Epithelial_cells Erythroblast Fibroblasts
16 8 10
Gametocytes GMP Hepatocytes
5 2 3
HSC_-G-CSF HSC_CD34+ iPS_cells
10 6 42
Keratinocytes Macrophage MEP
25 90 2
Monocyte MSC Myelocyte
60 9 2
Neuroepithelial_cell Neurons Neutrophils
1 16 21
NK_cell Osteoblasts Platelets
5 15 5
Pre-B_cell_CD34- Pro-B_cell_CD34+ Pro-Myelocyte
2 2 2
Smooth_muscle_cells T_cells Tissue_stem_cells
16 68 55
R version 4.1.2 (2021-11-01)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux Server release 6.3 (Santiago)
Matrix products: default
BLAS/LAPACK: /pbtech_mounts/homes022/frd2007/miniconda3/envs/r4_env/lib/libopenblasp-r0.3.18.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] SingleR_1.8.1 celldex_1.4.0
[3] scater_1.22.0 ggplot2_3.3.5
[5] batchelor_1.10.0 hdf5r_1.3.5
[7] magrittr_2.0.3 data.table_1.14.2
[9] BiocParallel_1.28.3 scran_1.22.1
[11] scuttle_1.4.0 SingleCellExperiment_1.16.0
[13] SummarizedExperiment_1.24.0 Biobase_2.54.0
[15] GenomicRanges_1.46.1 GenomeInfoDb_1.30.1
[17] IRanges_2.28.0 S4Vectors_0.32.4
[19] BiocGenerics_0.40.0 MatrixGenerics_1.6.0
[21] matrixStats_0.61.0 Matrix_1.4-1
loaded via a namespace (and not attached):
[1] ggbeeswarm_0.6.0 colorspace_2.0-3
[3] ellipsis_0.3.2 bluster_1.4.0
[5] XVector_0.34.0 BiocNeighbors_1.12.0
[7] ggrepel_0.9.1 bit64_4.0.5
[9] interactiveDisplayBase_1.32.0 AnnotationDbi_1.56.2
[11] fansi_1.0.3 sparseMatrixStats_1.6.0
[13] cachem_1.0.6 ResidualMatrix_1.4.0
[15] cluster_2.1.3 dbplyr_2.1.1
[17] png_0.1-7 shiny_1.7.1
[19] BiocManager_1.30.16 compiler_4.1.2
[21] httr_1.4.2 dqrng_0.3.0
[23] assertthat_0.2.1 fastmap_1.1.0
[25] limma_3.50.1 cli_3.2.0
[27] later_1.3.0 BiocSingular_1.10.0
[29] htmltools_0.5.2 tools_4.1.2
[31] rsvd_1.0.5 igraph_1.3.0
[33] gtable_0.3.0 glue_1.6.2
[35] GenomeInfoDbData_1.2.7 dplyr_1.0.8
[37] rappdirs_0.3.3 Rcpp_1.0.8.3
[39] vctrs_0.4.0 Biostrings_2.62.0
[41] ExperimentHub_2.2.1 DelayedMatrixStats_1.16.0
[43] beachmat_2.10.0 mime_0.12
[45] lifecycle_1.0.1 irlba_2.3.5
[47] statmod_1.4.36 AnnotationHub_3.2.2
[49] edgeR_3.36.0 zlibbioc_1.40.0
[51] scales_1.1.1 promises_1.2.0.1
[53] parallel_4.1.2 yaml_2.3.5
[55] curl_4.3.2 memoise_2.0.1
[57] gridExtra_2.3 RSQLite_2.2.12
[59] BiocVersion_3.14.0 ScaledMatrix_1.2.0
[61] filelock_1.0.2 rlang_1.0.2
[63] pkgconfig_2.0.3 bitops_1.0-7
[65] lattice_0.20-45 purrr_0.3.4
[67] bit_4.0.4 tidyselect_1.1.2
[69] R6_2.5.1 generics_0.1.2
[71] metapod_1.2.0 DelayedArray_0.20.0
[73] DBI_1.1.2 pillar_1.7.0
[75] withr_2.5.0 KEGGREST_1.34.0
[77] RCurl_1.98-1.6 tibble_3.1.6
[79] crayon_1.5.1 utf8_1.2.2
[81] BiocFileCache_2.2.1 viridis_0.6.2
[83] locfit_1.5-9.5 grid_4.1.2
[85] blob_1.2.2 digest_0.6.29
[87] xtable_1.8-4 httpuv_1.6.5
[89] munsell_0.5.0 beeswarm_0.4.0
[91] viridisLite_0.4.0 vipor_0.4.5